AML2019
Challenge 1
The model's output: a prediction of house prices
!pip3 install --user xgboost
!pip3 install --user missingno
# Import Libraries
import pandas as pd
import numpy as np
import math
import scipy.stats as ss
from scipy.stats import uniform
from scipy.stats import randint as sp_randint
from xgboost import XGBRegressor
from scipy import stats
from math import ceil
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import make_scorer
import warnings
warnings.filterwarnings('ignore')
import missingno as msno # Missingno package for visualizing missing data
from collections import Counter
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
%matplotlib inline
# Plot configuration: ggplot theme, 10x6 inch default figure size.
plt.style.use(style='ggplot')
plt.rcParams['figure.figsize'] = (10, 6)
# Challenge data: house-pricing train/test CSVs hosted on GitHub.
urltrain = "https://raw.githubusercontent.com/DistributedSystemsGroup/Algorithmic-Machine-Learning/master/Challenges/House_Pricing/challenge_data/train.csv"
urltest = "https://raw.githubusercontent.com/DistributedSystemsGroup/Algorithmic-Machine-Learning/master/Challenges/House_Pricing/challenge_data/test.csv"
dftrain = pd.read_csv(urltrain)
dftest = pd.read_csv(urltest)
# Notebook expression cells: show the shapes and a preview of the train set.
dftrain.shape, dftest.shape
dftrain.head()
Each row represents 1 house. There are 81 attributes: Id, MSSubClass, MSZoning, ..., and SalePrice.
# Structure and summary statistics of the training set.
dftrain.info()
dftrain.describe()
# log transformation: SalePrice is right-skewed, so model log(SalePrice) instead.
dftrain['LogSalePrice'] = np.log(dftrain['SalePrice'])
dftrain[['SalePrice','LogSalePrice']].describe()
plt.subplots(figsize =(20, 6))
# histogram of the raw target
plt.subplot(1, 3, 1)
sns.distplot(dftrain['SalePrice'], color='navy')
# boxplot of the raw target
plt.subplot(1, 3, 2)
sns.boxplot(data=dftrain['SalePrice'], color='navy').set_title('SalePrice')
# histogram of the log-transformed target
plt.subplot(1, 3, 3)
sns.distplot(dftrain['LogSalePrice'], color='navy')
# Merge the train and test data so imputations/encodings are applied consistently
dftotal = pd.concat((dftrain, dftest)).reset_index(drop=True)
# Drop the targets from the merged frame (test rows have no SalePrice).
dftotal.drop(['SalePrice','LogSalePrice'], axis=1, inplace=True)
# Sanity check on the row counts: 1200 (train) + 260 (test) = 1460 (total)
print(format(dftrain.shape))
print(format(dftest.shape))
print(format(dftotal.shape))
# Calculate the missing attributes
# For each attribute: total number of missing values and percentage missing.
total = dftotal.isnull().sum().sort_values(ascending=False)
percent = (dftotal.isnull().sum()/dftotal.isnull().count()*100).sort_values(ascending=False)
missing = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
# Keep only attributes that actually have missing values.
missing = missing[missing['Total']>0]
print("The below table of the attributes having missing values and percentages of non-missing values of these attributes:")
print()
print(missing)
print()
print("The number of attributes have missing values are", len(missing))
# Illustrating the missing values of each attribute having missing values
msno.matrix(dftotal[missing.index])
# For these attributes NaN denotes "feature absent" (no pool, no alley, ...),
# so replace NaN with the explicit category "None".
none_to_fill_col = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageFinish',
                    'GarageCond', 'GarageType', 'GarageQual', 'BsmtExposure', 'BsmtFinType2',
                    'BsmtCond', 'BsmtFinType1', 'BsmtQual', 'MasVnrType']
for col in none_to_fill_col:
    # Apply the same fill to the merged frame and to both original splits.
    dftotal[col] = dftotal[col].fillna("None")
    dftrain[col] = dftrain[col].fillna("None")
    dftest[col] = dftest[col].fillna("None")
# Re-check the missing attributes after the "None" fill.
total = dftotal.isnull().sum().sort_values(ascending=False)
percent = (dftotal.isnull().sum()/dftotal.isnull().count()*100).sort_values(ascending=False)
missing = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing = missing[missing['Total']>0]
print("The below table of the attributes having missing values and percentages of non-missing values of these attributes:")
print()
print(missing)
print()
print("The number of attributes have missing values are", len(missing))
We consider filling these missing LotFrontage values with the average LotFrontage of the other houses that share the same Neighborhood value.
# Print the 10 first missing LotFrontage cases
print(dftotal.loc[dftotal['LotFrontage'].isnull()][['LotFrontage','Neighborhood']][:10],'\n')
# Average LotFrontage per Neighborhood (used as the imputation value).
dftemp = dftotal.groupby('Neighborhood')['LotFrontage'].mean()
display(dftemp)
# Fill each missing LotFrontage with its neighborhood mean; the boolean-mask
# .loc assignment and the mapped full-length Series align on the row index.
dftotal.loc[dftotal['LotFrontage'].isnull(),'LotFrontage'] = dftotal['Neighborhood'].map(dftemp)
# NOTE(review): dftrain/dftest reuse dftemp computed on dftotal (train+test) —
# presumably intentional so all three frames share the same imputation values.
dftrain.loc[dftrain['LotFrontage'].isnull(),'LotFrontage'] = dftrain['Neighborhood'].map(dftemp)
dftest.loc[dftest['LotFrontage'].isnull(),'LotFrontage'] = dftest['Neighborhood'].map(dftemp)
# Spot-check a few previously-missing rows.
dftotal[['LotFrontage','Neighborhood']].loc[[7,12,43,50]]
# Print out the comparison between the YearBuilt and GarageYrBlt attributes
print("Display the first 5 rows just which 2 attributes YearBuilt and GarageYrBlt but GarageYrBlt having missing values")
display(dftotal.loc[dftotal['GarageYrBlt'].isnull()][['GarageYrBlt','YearBuilt']][:5])
print("Summary of the difference between YearBuilt and GarageYrBlt Attributes")
print((dftotal['GarageYrBlt']-dftotal['YearBuilt']).describe(), '\n')
# Replace the missing GarageYrBlt values with the YearBuilt values (the two
# columns are close on average, per the summary above).
dftotal.loc[dftotal['GarageYrBlt'].isnull(),'GarageYrBlt'] = dftotal['YearBuilt']
dftrain.loc[dftrain['GarageYrBlt'].isnull(),'GarageYrBlt'] = dftrain['YearBuilt']
dftest.loc[dftest['GarageYrBlt'].isnull(),'GarageYrBlt'] = dftest['YearBuilt']
# Compare before/after on a few previously-missing rows.
print("Display these 5 rows abow after fullfill the missing GarageYrBlt values")
display(dftotal[['GarageYrBlt','YearBuilt']].loc[[39,48,78,88,89]])
print("Summary of the difference between YearBuilt and GarageYrBlt Attributes after fullfill the missing GarageYrBlt values")
print((dftotal['GarageYrBlt']-dftotal['YearBuilt']).describe())
# Inspect rows where MasVnrArea is NaN alongside MasVnrType
# (the original comment compared "MasVnrArea with MasVnrArea" — typo).
print("Display these rows just with 2 attributes: MasVnrArea and MasVnrArea which MasVnrArea having missing values")
dftotal.loc[dftotal['MasVnrArea'].isnull()][['MasVnrType','MasVnrArea']]
# Replace MasVnrArea NaN values with 0 (no masonry veneer area).
dftotal['MasVnrArea'] = dftotal['MasVnrArea'].fillna(0)
dftrain['MasVnrArea'] = dftrain['MasVnrArea'].fillna(0)
dftest['MasVnrArea'] = dftest['MasVnrArea'].fillna(0)
print("Display these rows abow after fullfill the missing MasVnrArea values")
dftotal[['MasVnrType','MasVnrArea']].loc[[234,529,650,936]]
print("Display all rows having missing Electrical attribute value")
display(dftotal.loc[dftotal['Electrical'].isnull()])
print("Display categories exist of the Electrical attribute and how many houses belong to each category")
print(dftotal.groupby('Electrical')['Id'].count())
# Replace the missing Electrical value with the most common category ('SBrkr')
# and re-check. Fix: the original filled only dftotal and dftest; dftrain is
# now filled as well so all three frames stay consistent (every other
# imputation in this notebook updates all three).
dftotal['Electrical'] = dftotal['Electrical'].fillna('SBrkr')
dftrain['Electrical'] = dftrain['Electrical'].fillna('SBrkr')
dftest['Electrical'] = dftest['Electrical'].fillna('SBrkr')
# Should now display an empty frame (no missing Electrical left).
(dftotal.loc[dftotal['Electrical'].isnull()])
# Check if there are any missing values left in each frame.
# Fix: each frame's missing ratio must be divided by that frame's own length —
# the original divided the dftrain and dftest counts by len(dftotal), which
# understated their percentages by the train/total and test/total factors.
all_data_na = (dftotal.isnull().sum() / len(dftotal)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
print(missing_data.head())
all_data_na = (dftrain.isnull().sum() / len(dftrain)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
print(missing_data.head())
all_data_na = (dftest.isnull().sum() / len(dftest)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
print(missing_data.head())
#Prove that TotalBsmtSF = BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF
#dftotal['BsmtFinSF1'] + dftotal['BsmtFinSF2'] + dftotal['BsmtUnfSF']
# The two summaries below match, so the three sub-areas are redundant with TotalBsmtSF.
print((dftotal['BsmtFinSF1'] + dftotal['BsmtFinSF2'] + dftotal['BsmtUnfSF']).describe(),'\n')
print(dftotal['TotalBsmtSF'].describe())
dftotal.drop(['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF'], axis=1, inplace=True)
dftest.drop(['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF'], axis=1, inplace=True)
# Feature engineering on the merged frame. The commented-out dftest lines are
# left from an earlier approach: X_test is later sliced out of dftotal instead.
# TotalSF: aggregate living/basement surface into a single feature.
dftotal['TotalSF'] = dftotal['TotalBsmtSF'] + dftotal['1stFlrSF'] + dftotal['2ndFlrSF'] + dftotal['GrLivArea']
#dftest['TotalSF'] = dftest['TotalBsmtSF'] + dftest['1stFlrSF'] + dftest['2ndFlrSF'] + dftest['GrLivArea']
dftotal.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea'], axis=1, inplace=True)
#dftest.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea'], axis=1, inplace=True)
# TotalBath: full baths count 1, half baths count 0.5.
dftotal['TotalBath'] = dftotal['FullBath'] + dftotal['BsmtFullBath'] + 0.5*(dftotal['HalfBath'] + dftotal['BsmtHalfBath'])
#dftest['TotalBath'] = dftest['FullBath'] + dftest['BsmtFullBath'] + 0.5*(dftest['HalfBath'] + dftest['BsmtHalfBath'])
dftotal.drop(['FullBath', 'BsmtFullBath', 'HalfBath', 'BsmtHalfBath'], axis=1, inplace=True)
#dftest.drop(['FullBath', 'BsmtFullBath', 'HalfBath', 'BsmtHalfBath'], axis=1, inplace=True)
# Ages at sale time; abs() guards rows where the (remodel) year is after YrSold.
dftotal['AgeAtSell'] = abs(dftotal['YrSold'] - dftotal['YearBuilt'])
#dftest['AgeAtSell'] = abs(dftest['YrSold'] - dftest['YearBuilt'])
dftotal['RemodAgeAtSell'] = abs(dftotal['YrSold'] - dftotal['YearRemodAdd'])
#dftest['RemodAgeAtSell'] = abs(dftest['YrSold'] - dftest['YearRemodAdd'])
dftotal.drop(['YrSold', 'YearRemodAdd','MoSold','YearBuilt'], axis=1, inplace=True)
#dftest.drop(['YrSold', 'YearRemodAdd','MoSold','YearBuilt'], axis=1, inplace=True)
# Check the Categorical and Numerical data
# Checking Numerical Data: every int64/float64 column except the Id identifier.
num_col=(dftotal.select_dtypes(include=['int64','float64']).columns)
num_col = num_col.drop(['Id'])
# Fix: this counts NUMERICAL columns, so the message now says "numerical"
# (the original printed "The number of categoraical variables").
print("The number of numerical variables:", len(num_col),'\n')
print(num_col)
dftotal[num_col].info()
# Statistic description of all the numerical attributes, sorted by count.
dftotal[num_col].describe().T.sort_values(by='count')
# scatter plots of each numerical attribute against SalePrice
tempdata = pd.concat([dftotal[:len(dftrain)], dftrain['SalePrice']], axis=1) #get the train data - data frame
temp = pd.melt(tempdata, id_vars=['SalePrice'],value_vars=num_col)
grid = sns.FacetGrid(temp, col="variable", col_wrap=4 , height=3.0,
aspect=1.2,sharex=False, sharey=False)
grid.map(plt.scatter, "value",'SalePrice', s=3,color='navy')
plt.show()
# MSSubClass/OverallCond/OverallQual are integer-coded categories, not true
# numerical attributes — drop them from the numerical list.
num_col = num_col.drop(['MSSubClass', 'OverallCond', 'OverallQual'])
# visualize the distribution of each numerical feature
temp = pd.melt(dftotal, value_vars=num_col)
grid = sns.FacetGrid(temp, col="variable", col_wrap=5 , height=3.0,
aspect=1.0,sharex=False, sharey=False)
grid.map(sns.distplot, "value",color='navy')
plt.show()
# Plot the boxplot of each attribute compared with the SalePrice
def chunks(l, n):
    """Split sequence *l* into consecutive slices of length *n*; the final slice may be shorter."""
    pieces = []
    start = 0
    while start < len(l):
        pieces.append(l[start:start + n])
        start += n
    return pieces
def boxplot(df, cols, ncols):
    """Draw boxplots for the columns *cols* of *df*, `ncols` subplots per figure row.

    Fix: the original computed `data = pd.concat([df['SalePrice'], df[attr]], axis=1)`
    on every iteration but never used it (dead code); it is removed here.

    :param df: source DataFrame (must contain every column in *cols*)
    :param cols: sequence of column names to plot
    :param ncols: number of subplots per figure row
    """
    for lst in chunks(cols, ncols):
        # One figure per group of up to `ncols` attributes.
        fig, axes = plt.subplots(nrows=1, ncols=ncols, figsize=(10, 4), dpi=200)
        sns.set(font_scale = 0.7)
        for idx in range(0, len(lst)):
            attr = lst[idx]
            sns.set_palette('Paired',30)
            # NOTE(review): only df[attr] itself is boxplotted here — the
            # SalePrice-vs-category comparison happens in a later variant.
            g = sns.boxplot(data=df[attr], ax=axes[idx],fliersize=0.5, linewidth=0.5, color='navy').set_title(attr)
        plt.tight_layout()
boxplot(tempdata, num_col, 3)
## CORRELATION MATRIX FOR NUMERICAL ATTRIBUTES
# Spearman (rank) correlation between the numeric (int64/float64) attributes.
correlationMatrix = dftotal[num_col].corr(method='spearman')
# PLOT THE CORRELATION MATRIX
plt.figure(figsize=(25,25))
plt.title("Correlation matrix between numerical attributes", weight="semibold")
# Fix: the original called `ax.title.set_fontsize(20)` here, but `ax` is not
# defined anywhere before this point (NameError at run time) — use the
# current axes instead.
plt.gca().title.set_fontsize(20)
# Colour half: mask the upper triangle so each pair is shown once.
# Fix: the `np.bool` alias was removed in NumPy 1.24+; use the builtin bool.
mask = np.zeros_like(correlationMatrix, dtype=bool)
mask[np.triu_indices_from(mask)] = True
sns.set(font_scale = 1.3)
g = sns.heatmap(correlationMatrix, cmap='Blues', fmt = '.2f', square = True,
                mask=mask, annot=True, annot_kws={"size":14}, linewidths=1.0)
# Blank out annotations for weak correlations (< 0.65) to reduce clutter.
for text in g.texts:
    t = float(text.get_text())
    if ((t) < 0.65):
        text.set_text('')
    else:
        text.set_text(round(t, 4))
# Value half: print the numbers of the other triangle on white.
mask[np.triu_indices_from(mask)] = False
mask[np.tril_indices_from(mask)] = True
g = sns.heatmap(correlationMatrix, cmap=ListedColormap(['white']), square = True, fmt = '.1f',
                linewidths=1.0, mask=mask, annot=True, annot_kws={"size":12}, cbar=False);
g.set_xticklabels(g.get_xticklabels(), rotation=60, ha="right");
# The matrix is symmetric: keep the upper triangle without the diagonal (k=1).
print("Display first 10 pairs of features with high correlation:")
correlationMatrix.where(np.triu(np.ones(correlationMatrix.shape), k=1)
                        .astype(bool)).stack().sort_values(ascending=False)[:10]
As we know, features with high correlation are more linearly dependent and hence have almost the same effect on the dependent variable. So, when two features are highly correlated, we consider dropping one of them.
# Prepend SalePrice so the correlation table includes the target itself.
sale_num_col = num_col.insert(0,'SalePrice')
correlationMatrix = tempdata[sale_num_col].corr(method='spearman').abs()
sorted_corr = (correlationMatrix.loc[:, ['SalePrice']] # Select the SalePrice line in the correlation tab
               .sort_values(by='SalePrice', ascending=False).T) # Sort values by descending correlation coef with SalePrice
# Plot the heatmap (a single 1-row strip of |Spearman| coefficients)
plt.figure(figsize=(27, 1))
ax = sns.heatmap(sorted_corr,
                 # Annotations options
                 annot=True, annot_kws={'size':15, 'weight':'bold'}, fmt='.2f',
                 # Display options
                 linewidths=1, cbar=False, cmap='Blues')
# Resize the labels
for label in ax.get_xticklabels()+ax.get_yticklabels():
    label.set_rotation(75)
    label.set_fontsize(15)
plt.title("The Correlations coefficients between SalePrice and the Numerical Attributes")
ax.title.set_fontsize(20)
plt.show()
# Checking Categorical Data; the integer-coded MSSubClass/OverallCond/OverallQual
# columns are re-typed to object so they are treated as categorical too.
cat_col = ["MSSubClass", "OverallCond", "OverallQual"]
dftotal[cat_col] = dftotal[cat_col].astype('object')
cat_col = dftotal.select_dtypes(include=['object']).columns
#cat_col = cat_col.insert(0,"MoSold", "YrSold", "MSSubClass", "OverallCond", "OverallQual")
# Fix: corrected the "categoraical" typo in the printed message.
print("Number of categorical variables:", len(cat_col),'\n')
print(cat_col)
According to the Data Description file, the categorical data has two types:
Nominal categories
Ordinal categories
# Nominal (unordered) categorical attributes.
Nom_cat_col = ["MSSubClass", "MSZoning", "Street", "Alley", "LandContour", "LotConfig", "Neighborhood",
               "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1",
               "Exterior2", "MasVnrType", "MasVnrArea", "Foundation", "Heating", "CentralAir", "GarageType",
               "MiscFeature", "SaleType", "SaleCondition"]
# NOTE(review): Ord_cat_col is identical to Nom_cat_col — it was presumably
# meant to list the ORDINAL (quality/condition) attributes instead. Neither
# list is used later in the notebook; confirm before relying on them.
Ord_cat_col = ["MSSubClass", "MSZoning", "Street", "Alley", "LandContour", "LotConfig", "Neighborhood",
               "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1",
               "Exterior2", "MasVnrType", "MasVnrArea", "Foundation", "Heating", "CentralAir", "GarageType",
               "MiscFeature", "SaleType", "SaleCondition"]
print("Summary of each categorical attribute")
dftotal[cat_col].describe().T.sort_values(by=['count'],ascending=True)
# Plot the distribbution of each attributes
def chunks(l, n):
    """Return *l* cut into consecutive length-*n* groups (the last group may be shorter)."""
    return [l[offset:offset + n] for offset in range(0, len(l), n)]
def histplot(df, cols, ncols):
    """Plot category-count bar charts for the columns *cols* of *df*, `ncols` per figure row."""
    for group in chunks(cols, ncols):
        fig, axes = plt.subplots(nrows=1, ncols=ncols, figsize=(10, 3), dpi=200)
        sns.set(font_scale = 0.5)
        for pos, attr in enumerate(group):
            sns.set_palette('Paired', 30)
            axis_plot = sns.countplot(x=attr, data=df, ax=axes[pos])
            # Tilt the tick labels so long category names stay readable.
            for tick in axis_plot.get_xticklabels():
                tick.set_rotation(75)
        plt.tight_layout()
# Distribution of every categorical attribute, 4 plots per row.
histplot(dftotal, cat_col, 4)
# scatter plots of each categorical attribute against SalePrice
tempdata = pd.concat([dftotal[:len(dftrain)], dftrain['SalePrice']], axis=1) #get the train data - data frame
temp = pd.melt(tempdata, id_vars=['SalePrice'],value_vars=cat_col)
sns.set(font_scale = 0.8)
grid = sns.FacetGrid(temp, col="variable", col_wrap=4 , height=3.0,
aspect=1.2,sharex=False, sharey=False).set_xticklabels(rotation=90)
grid.map(plt.scatter, "value",'SalePrice', s=3,color='navy')
plt.show()
# Plot the boxplot of each attribute compared with the SalePrice
def chunks(l, n):
    """Split *l* into successive groups of *n* items; the tail group may be smaller."""
    bounds = range(0, len(l), n)
    return [l[b:b + n] for b in bounds]
def boxplot(df, cols, ncols):
    """For each attribute in *cols*, draw a boxplot of SalePrice per category, `ncols` plots per figure row."""
    for group in chunks(cols, ncols):
        fig, axes = plt.subplots(nrows=1, ncols=ncols, figsize=(10, 4), dpi=200)
        sns.set(font_scale = 0.7)
        for pos, attr in enumerate(group):
            # Pair the attribute with the target so seaborn can group by category.
            subset = pd.concat([df['SalePrice'], df[attr]], axis=1)
            sns.set_palette('Paired', 30)
            axis_plot = sns.boxplot(x=attr, y='SalePrice', data=subset, ax=axes[pos], fliersize=0.5, linewidth=0.5)
            for tick in axis_plot.get_xticklabels():
                tick.set_rotation(75)
        plt.tight_layout()
# SalePrice-vs-category boxplots for every categorical attribute, 3 per row.
boxplot(tempdata, cat_col, 3)
# Define Functions
def convert(data, to):
    """Convert *data* between list / NumPy array / pandas DataFrame representations.

    :param data: list, np.ndarray, pd.Series or pd.DataFrame
    :param to: target kind — one of 'array', 'list', 'dataframe'
    :return: the converted object
    :raises ValueError: if *to* is not a known target kind
    :raises TypeError: if *data* cannot be converted to *to*
    """
    converted = None
    if to == 'array':
        if isinstance(data, np.ndarray):
            converted = data
        elif isinstance(data, pd.Series):
            converted = data.values
        elif isinstance(data, list):
            converted = np.array(data)
        elif isinstance(data, pd.DataFrame):
            # Fix: DataFrame.as_matrix() was deprecated in pandas 0.23 and
            # removed in 1.0 — to_numpy() is the supported replacement.
            converted = data.to_numpy()
    elif to == 'list':
        if isinstance(data, list):
            converted = data
        elif isinstance(data, pd.Series):
            converted = data.values.tolist()
        elif isinstance(data, np.ndarray):
            converted = data.tolist()
    elif to == 'dataframe':
        if isinstance(data, pd.DataFrame):
            converted = data
        elif isinstance(data, np.ndarray):
            converted = pd.DataFrame(data)
    else:
        raise ValueError("Unknown data conversion: {}".format(to))
    if converted is None:
        raise TypeError('cannot handle data conversion of type: {} to {}'.format(type(data),to))
    else:
        return converted
def cramers_v(x, y):
    """
    Bias-corrected Cramer's V statistic for categorical-categorical association.
    Uses the correction from Bergsma and Wicher, Journal of the Korean
    Statistical Society 42 (2013): 323-328.
    This is a symmetric coefficient: V(x,y) = V(y,x)
    Original function taken from: https://stackoverflow.com/a/46498792/5863503
    Wikipedia: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V
    :param x: list / NumPy ndarray / Pandas Series of categorical measurements
    :param y: list / NumPy ndarray / Pandas Series of categorical measurements
    :return: float in the range of [0,1]
    """
    table = pd.crosstab(x, y)
    chi2_stat = ss.chi2_contingency(table)[0]
    n_obs = table.sum().sum()
    n_rows, n_cols = table.shape
    phi2 = chi2_stat / n_obs
    # Bias correction: shrink phi2 and the table dimensions by their expected inflation.
    phi2_corrected = max(0, phi2 - ((n_cols - 1) * (n_rows - 1)) / (n_obs - 1))
    rows_corrected = n_rows - ((n_rows - 1) ** 2) / (n_obs - 1)
    cols_corrected = n_cols - ((n_cols - 1) ** 2) / (n_obs - 1)
    return np.sqrt(phi2_corrected / min(cols_corrected - 1, rows_corrected - 1))
def conditional_entropy(x, y):
    """
    Calculates the conditional entropy of x given y: S(x|y), in nats.
    Wikipedia: https://en.wikipedia.org/wiki/Conditional_entropy
    :param x: list / NumPy ndarray / Pandas Series of measurements
    :param y: list / NumPy ndarray / Pandas Series of measurements
    :return: float
    """
    counts_y = Counter(y)
    counts_xy = Counter(list(zip(x, y)))
    total = sum(counts_y.values())
    result = 0.0
    for pair, joint_count in counts_xy.items():
        p_joint = joint_count / total
        p_given = counts_y[pair[1]] / total
        # H(X|Y) = -sum p(x,y) log(p(x,y)/p(y)) = sum p(x,y) log(p(y)/p(x,y))
        result += p_joint * math.log(p_given / p_joint)
    return result
def theils_u(x, y):
    """
    Calculates Theil's U statistic (Uncertainty coefficient) for
    categorical-categorical association: the uncertainty of x given y.
    0 means y provides no information about x; 1 means full information.
    Asymmetric: U(x,y) != U(y,x).
    Wikipedia: https://en.wikipedia.org/wiki/Uncertainty_coefficient
    :param x: list / NumPy ndarray / Pandas Series of categorical measurements
    :param y: list / NumPy ndarray / Pandas Series of categorical measurements
    :return: float in the range of [0,1]
    """
    h_x_given_y = conditional_entropy(x, y)
    counts_x = Counter(x)
    total = sum(counts_x.values())
    p_x = [c / total for c in counts_x.values()]
    h_x = ss.entropy(p_x)
    # A constant x has zero entropy: treat it as fully determined by y.
    if h_x == 0:
        return 1
    return (h_x - h_x_given_y) / h_x
def correlation_ratio(categories, measurements):
    """
    Calculates the Correlation Ratio for categorical-continuous association.
    Answers the question - given a continuous value of a measurement, is it
    possible to know which category it is associated with?
    Value is in the range [0,1]: 0 means a category cannot be determined by a
    continuous measurement, 1 means it can be determined with certainty.
    Wikipedia: https://en.wikipedia.org/wiki/Correlation_ratio
    :param categories: list / NumPy ndarray / Pandas Series of categorical measurements
    :param measurements: list / NumPy ndarray / Pandas Series of continuous measurements
    :return: float in the range of [0,1]

    NOTE(review): this returns numerator/denominator, i.e. eta SQUARED, not
    eta — rankings are unaffected (monotonic), but verify before quoting values.
    """
    # Fix: convert inputs with np.asarray directly instead of the local
    # `convert` helper, whose DataFrame path used the removed as_matrix().
    categories = np.asarray(categories)
    measurements = np.asarray(measurements)
    # Encode categories as integers 0..cat_num-1.
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat)+1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    # Per-category size and mean of the measurements.
    for i in range(0,cat_num):
        cat_measures = measurements[np.argwhere(fcat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    y_total_avg = np.sum(np.multiply(y_avg_array,n_array))/np.sum(n_array)
    # Between-category variance over total variance.
    numerator = np.sum(np.multiply(n_array,np.power(np.subtract(y_avg_array,y_total_avg),2)))
    denominator = np.sum(np.power(np.subtract(measurements,y_total_avg),2))
    # numerator == 0 implies identical category means (and covers the
    # all-equal-measurements case, avoiding a 0/0 division).
    if numerator == 0:
        eta = 0.0
    else:
        eta = numerator/denominator
    return eta
## Correlation matrix for categorical attributes, using Theil's U.
correlationMatrix = pd.DataFrame(index=cat_col, columns=cat_col)
for col in cat_col:
    for row in cat_col:
        temp = theils_u(dftotal[col], dftotal[row])
        # NOTE(review): Theil's U is asymmetric, but the value is written into
        # both cells, so later iterations overwrite earlier ones and the matrix
        # ends up symmetric — confirm this symmetrization is intended.
        correlationMatrix[row][col] = temp
        correlationMatrix[col][row] = temp
correlationMatrix = correlationMatrix.astype(float)
# Plot the correlation matrix
plt.figure(figsize=(30,30))
plt.title("Correlation matrix between categorical attributes", weight="semibold")
# Fix: the original resized the title through the stale `ax` handle left over
# from an earlier cell (a different figure); use the current axes instead.
plt.gca().title.set_fontsize(20)
# Colour half: mask the upper triangle so each pair is coloured once.
# Fix: the `np.bool` alias was removed in NumPy 1.24+; use the builtin bool.
mask = np.zeros_like(correlationMatrix, dtype=bool)
mask[np.triu_indices_from(mask)] = True
g = sns.heatmap(correlationMatrix, cmap='Blues', fmt = '.2f', square = True,
                mask=mask, annot=True, annot_kws={"size":10}, linewidths=1.0);
# Blank out weak associations (< 0.5) to reduce clutter.
for text in g.texts:
    t = float(text.get_text())
    if ((t) < 0.5):
        text.set_text('')
    else:
        text.set_text(round(t, 4))
# Value half: print the numbers of the other triangle on white.
mask[np.triu_indices_from(mask)] = False
mask[np.tril_indices_from(mask)] = True
g = sns.heatmap(correlationMatrix, cmap=ListedColormap(['white']), square = True, fmt = '.1f',
                linewidths=1.0, mask=mask, annot=True, annot_kws={"size":10}, cbar=False);
g.set_xticklabels(g.get_xticklabels(), rotation=60, ha="right");
# The matrix is symmetric: keep the upper triangle without the diagonal (k=1).
print("Display first 10 pairs of features with high correlation:")
correlationMatrix.where(np.triu(np.ones(correlationMatrix.shape), k=1)
                        .astype(bool)).stack().sort_values(ascending=False)[:10]
# Pick out the top 20 categorical attributes by correlation ratio with SalePrice
sale_cat_col = cat_col.insert(0,'SalePrice')
sorted_corr = pd.DataFrame(index=['SalePrice'], columns=sale_cat_col)
for col in sale_cat_col:
    # NOTE(review): chained-indexing assignment (works here on an object frame
    # but is fragile); 'SalePrice' itself is also in the loop, so its self-ratio
    # occupies one of the 20 displayed slots.
    sorted_corr[col]['SalePrice'] = correlation_ratio(tempdata[col], tempdata['SalePrice'])
# Sort columns by descending correlation with SalePrice and keep the first 20.
sorted_corr = sorted_corr.T.sort_values(by='SalePrice',ascending=False).T
sorted_corr = sorted_corr.loc['SalePrice'][:20].to_frame().T.astype(float)
# Plot the heatmap (a single 1-row strip)
plt.figure(figsize=(20, 1))
ax = sns.heatmap(sorted_corr,
                 # Annotations options
                 annot=True, annot_kws={'size':15, 'weight':'bold'}, fmt='.2f',
                 # Display options
                 linewidths=1, cbar=False, cmap='Blues')
# Resize the labels
for label in ax.get_xticklabels()+ax.get_yticklabels():
    label.set_rotation(75)
    label.set_fontsize(15)
plt.title("The Correlations coefficients between SalePrice and the top 20 Categorical Attributes")
ax.title.set_fontsize(20)
plt.show()
# Create a list of dummy columns to drop: the last column generated from each
# categorical feature (avoids the redundant, perfectly-collinear dummy).
dummy_drop = []
for i in cat_col:
    dummy_drop += [ i+'_'+str(dftotal[i].unique()[-1]) ]
# create dummy variables (one-hot encode every categorical column)
dftotal = pd.get_dummies(dftotal,columns=cat_col)
# drop the last column generated from each categorical feature
dftotal = dftotal.drop(dummy_drop,axis=1)
print(format(dftotal.shape))
dftotal[:10]
# Split the engineered frame back into train/test feature matrices (Id dropped).
X_train = dftotal[:len(dftrain)].drop(['Id'], axis=1)
# Target: log-transformed SalePrice (see the earlier skewness analysis).
y_train = np.log(dftrain['SalePrice'])
X_test = dftotal[len(dftrain):].drop(['Id'], axis=1)
X_train.shape, y_train.shape, X_test.shape
# fit the scaler on the training set only, then transform both the training
# and test sets (RobustScaler centers/scales with median and IQR, so it is
# less sensitive to the outliers seen in the scatter plots)
scaler = RobustScaler()
X_train[num_col]= scaler.fit_transform(X_train[num_col])
X_test[num_col]= scaler.transform(X_test[num_col])
# Fit a default XGBoost regressor just to rank the feature importances.
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
imp = pd.DataFrame(xgb.feature_importances_ ,columns = ['Importance'],index = X_train.columns)
imp = imp.sort_values(['Importance'], ascending = False)
print(imp)
print("Display the level of importance of the attributes:")
imp[:60].sort_values('Importance').plot(kind="barh",figsize=(15,25), color='navy')
plt.xticks(rotation=90)
plt.show()
Now we can use RFECV to eliminate the redundant features.
# RMSE and its negation (sklearn searches maximize, so "negative RMSE" is a score).
def rmse(y_true, y_pred):
    """Root-mean-squared error between *y_true* and *y_pred*."""
    return np.sqrt(np.mean(np.square(y_true - y_pred)))
def nrmse(y_true, y_pred):
    """Negative RMSE, so that greater is better (sklearn scorer convention)."""
    return -rmse(y_true, y_pred)
# Wrap nrmse as a sklearn scorer (greater is better).
neg_rmse = make_scorer(nrmse)
estimator = XGBRegressor()
# Recursive feature elimination with 3-fold CV to drop redundant features.
selector = RFECV(estimator, cv = 3, n_jobs = -1, scoring = neg_rmse)
selector = selector.fit(X_train, y_train)
print("The number of selected features is: {}".format(selector.n_features_))
# Keep only the selected columns in both the train and test matrices.
features_kept = X_train.columns.values[selector.support_]
X_train = X_train[features_kept]
X_test = X_test[features_kept]
print("Display attributes kept:")
(features_kept)
print("Display the shape of training set and test set")
X_train.shape, y_train.shape, X_test.shape
# Candidate regressors to compare, all with default hyperparameters first.
models = [Ridge(),
          Lasso(),
          SVR(),
          KernelRidge(),
          ElasticNet(),
          BayesianRidge()]
names = ['Ridge', 'Lasso', 'SVR', 'KernelRidge', 'ElasticNet','BayesianRidge']
# define cross validation strategy
def rmse_cv(model, X, y):
    """Return the per-fold RMSE of *model* under 5-fold cross-validation."""
    fold_mse = -cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5)
    return np.sqrt(fold_mse)
# Collect each model's mean CV RMSE into a comparison table.
dftemp = pd.DataFrame(columns=['Parameters','Mean Test Score'], index=names)
# Run model
for name, model in zip(names, models):
    score = rmse_cv(model, X_train, y_train)
    dftemp['Parameters'][name] = model
    dftemp['Mean Test Score'][name] = score.mean()
    #dftemp['Std Test Score'][name] = score.std()
# Rank the models from best (lowest RMSE) to worst, then inspect KernelRidge.
dftemp.sort_values(by='Mean Test Score')
dftemp['Parameters']['KernelRidge']
# Define a gridsearch method for hyperparameters tuning
class grid():
    """Thin wrapper around GridSearchCV that prints and records the best result."""
    def __init__(self, model):
        self.model = model
    def grid_get(self, X, y, param_grid, modelname, resultdf):
        """Exhaustively search *param_grid* with 5-fold CV; print the best
        parameters/RMSE and append a row named *modelname* to *resultdf*."""
        searcher = GridSearchCV(self.model,
                                param_grid,
                                cv=5,
                                scoring='neg_mean_squared_error')
        searcher.fit(X, y)
        best_rmse = np.sqrt(-searcher.best_score_)
        print('Best Parameter: ', searcher.best_params_)
        print('Best RMSE: ', best_rmse)
        # Convert the stored fold scores from negative MSE to RMSE as well.
        searcher.cv_results_['mean_test_score'] = np.sqrt(-searcher.cv_results_['mean_test_score'])
        resultdf.loc[modelname] = [searcher.best_params_, best_rmse]
# Results table: one row per model, filled in by grid.grid_get.
griddf = pd.DataFrame(columns=['Parameters','Mean Test Score'])
# Lasso: small alphas, both coordinate-descent orders.
param_grid = {'alpha': [0.0005, 0.005, 0.01],'warm_start': [True, False],
              'selection': ['cyclic', 'random'],'fit_intercept': [True, False], 'max_iter':[10000]}
grid(Lasso()).grid_get(X_train,y_train,param_grid,'GridSearch_Lasso',griddf)
# Ridge: alpha plus every available solver.
param_grid = {'alpha': [0.5,1.0,1.5],
              'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
              'fit_intercept': [True, False]}
grid(Ridge()).grid_get(X_train,y_train,param_grid,'GridSearch_Ridge',griddf)
# SVR: RBF kernel; gamma fixed at 1/n_samples.
param_grid = {'C':[3,6,9,12,15],'kernel':['rbf'],'gamma':[1/len(X_train)],'epsilon':[0.05,0.1,0.9]}
grid(SVR()).grid_get(X_train,y_train,param_grid,'GridSearch_SVR',griddf)
# Kernel Ridge with a degree-3 polynomial kernel.
param_grid={'alpha':[0.5,0.9], 'kernel':['polynomial'],'gamma':[0.1,0.5], 'degree':[3],'coef0':[0.8,1]}
grid(KernelRidge()).grid_get(X_train,y_train,param_grid,'GridSearch_KernelRidge',griddf)
# ElasticNet: blend of L1/L2 penalties.
param_grid = {'alpha': [0.5, 1.0, 1.5],
              'l1_ratio': [0.3, 0.5, 0.9],
              'selection': ['cyclic', 'random'],
              'fit_intercept': [True, False],'max_iter':[10000]}
grid(ElasticNet()).grid_get(X_train,y_train,param_grid,'GridSearch_ElasticNet',griddf)
# BayesianRidge: Gamma-prior hyperparameters.
param_grid = {'tol':[0.01,0.001,0.0009], 'alpha_1':[1e-05,1e-6, 1e-7],
              'alpha_2':[1e-05,1e-6, 1e-7], 'lambda_1':[1e-05,1e-6, 1e-7],
              'lambda_2':[1e-05,1e-6, 1e-7], 'n_iter':[100000]}
grid(BayesianRidge()).grid_get(X_train,y_train, param_grid,'GridSearch_BayesianRidge',griddf)
# Rank the grid-search results by CV RMSE (lower is better).
griddf.sort_values(by='Mean Test Score')
# Define a RandomizedSearch method for hyperparameters tuning.
# NOTE: this redefines the `grid` class above, replacing grid_get with random_get.
class grid():
    """Thin wrapper around RandomizedSearchCV that prints and records the best result."""
    def __init__(self, model):
        self.model = model
    def random_get(self, X, y, param_grid, modelname, resultdf):
        """Sample 1000 candidates from *param_grid* with 5-fold CV; print the
        best parameters/RMSE and append a row named *modelname* to *resultdf*."""
        searcher = RandomizedSearchCV(self.model,
                                      param_grid,
                                      cv=5,
                                      scoring='neg_mean_squared_error',
                                      n_jobs = -1,
                                      n_iter = 1000,
                                      random_state=0)
        searcher.fit(X, y)
        best_rmse = np.sqrt(-searcher.best_score_)
        print('Best Parameter: ', searcher.best_params_)
        print('Best RMSE: ', best_rmse)
        # Convert the stored fold scores from negative MSE to RMSE as well.
        searcher.cv_results_['mean_test_score'] = np.sqrt(-searcher.cv_results_['mean_test_score'])
        resultdf.loc[modelname] = [searcher.best_params_, best_rmse]
# Results table for the randomized searches, filled in by grid.random_get.
randdf = pd.DataFrame(columns=['Parameters','Mean Test Score'])
# Lasso: alpha ~ Uniform(loc=0.0002, scale=0.0003), i.e. [0.0002, 0.0005].
param_grid = {'alpha': ss.uniform(0.0002,0.0003)}
grid(Lasso()).random_get(X_train,y_train,param_grid,'Randomized_Lasso',randdf)
#Ridge
param_grid = {'alpha':ss.uniform(0.9,1)}
grid(Ridge()).random_get(X_train,y_train,param_grid,'Randomized_Ridge',randdf)
#SVR
param_grid = {'C':sp_randint(14,18),'kernel':['rbf'],'gamma':[1/len(X_train)],'epsilon':ss.uniform(0.008,0.009)}
grid(SVR()).random_get(X_train,y_train,param_grid,'Randomized_SVR',randdf)
#KernelRidge
param_grid = {'alpha': ss.uniform(0.05, 1.0), 'kernel': ['polynomial'], 'gamma':ss.uniform(0.1,0.5),
              'degree': [2], 'coef0':uniform(0.5, 3.5)}
grid(KernelRidge()).random_get(X_train,y_train,param_grid,'Randomized_KernelRidge',randdf)
#ElasticNet
param_grid = {'alpha':ss.uniform(0.3,0.5),'l1_ratio':ss.uniform(0.1,0.2)}
grid(ElasticNet()).random_get(X_train,y_train,param_grid,'Randomized_ElasticNet',randdf)
#Bayesian Ridge
# NOTE(review): in ss.gamma(a, loc) the second positional argument is a LOCATION
# shift, not a scale — verify these prior distributions are what was intended.
param_grid = {'tol':[0.001], 'alpha_1':ss.gamma(1e-7, 1e-8),
              'alpha_2':ss.gamma(1e-4, 1e-5), 'lambda_1':ss.gamma(1e-4, 1e-5),
              'lambda_2':ss.gamma(1e-4, 1e-5)}
grid(BayesianRidge()).random_get(X_train,y_train, param_grid,'Randomized_BayesianRidge',randdf)
randdf
# Combine grid-search and randomized-search results and rank by CV RMSE.
ranking = pd.concat([griddf, randdf])
ranking
ranking = ranking.sort_values(by='Mean Test Score')
ranking
# Final model: Lasso with the best alpha found by the randomized search above.
model = Lasso(alpha= 0.0003366332448350081)
model.fit(X_train,y_train)
# Make predictions on the test set; exp() undoes the log transform applied to the target.
y_pred = np.exp(model.predict(X_test))
# Write the submission file: one predicted SalePrice per test-house Id.
output = pd.DataFrame({'Id': dftest['Id'], 'SalePrice': y_pred})
output.to_csv('prediction.csv', index=False)